{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 0.19661193 -0.19661193]\n",
      " [-0.19661193  0.19661193]]\n",
      "[-0.39322387  0.39322387]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "def softmax(Z):\n",
    "    exp_Z = np.exp(Z-np.max(Z,axis=1,keepdims=True))\n",
    "    return exp_Z/np.sum(exp_Z,axis=1,keepdims=True)\n",
    "\n",
    "def softmax_gradient(z,isF = False):   \n",
    "    if isF:\n",
    "        f = z\n",
    "    else:\n",
    "        f = softmax(z)\n",
    "    grad = -np.outer(f, f) + np.diag(f.flatten())\n",
    "    return grad\n",
    "\n",
    "def softmax_backward(z,df,isF = False):\n",
    "    \"\"\"Back-propagate ``df`` through softmax: ``df @ softmax_gradient(z, isF)``.\"\"\"\n",
    "    jacobian = softmax_gradient(z, isF)\n",
    "    return df @ jacobian\n",
    "\n",
    "# Single-sample demo: x has one row; df is the gradient passed to softmax_backward.\n",
    "x = np.array([[1, 2]])\n",
    "print(softmax_gradient(x))\n",
    "df = np.array([1, 3])\n",
    "print(softmax_backward(x,df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[[ 0.19661193 -0.19661193]\n",
      "  [-0.19661193  0.19661193]]\n",
      "\n",
      " [[ 0.04517666 -0.04517666]\n",
      "  [-0.04517666  0.04517666]]]\n"
     ]
    }
   ],
   "source": [
    "def softmax_gradient(z,isF = False): \n",
    "    if isF:\n",
    "        f = z\n",
    "    else:\n",
    "        f = softmax(z)\n",
    "    \n",
    "    if len(df)==1:\n",
    "        return -np.outer(f, f) + np.diag(f.flatten())\n",
    "    else:  \n",
    "        grads = []\n",
    "        for i in range(len(f)):\n",
    "            fi = f[i]\n",
    "            grad = -np.outer(fi, fi) + np.diag(fi.flatten())  \n",
    "            grads.append(grad)\n",
    "        return np.array(grads)\n",
    "\n",
    "# Two-row input: one Jacobian is printed per row.\n",
    "x = np.array([[1, 2],[2, 5]])\n",
    "print(softmax_gradient(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[[ 0.19661193 -0.19661193]\n",
      "  [-0.19661193  0.19661193]]\n",
      "\n",
      " [[ 0.04517666 -0.04517666]\n",
      "  [-0.04517666  0.04517666]]]\n"
     ]
    }
   ],
   "source": [
    "def softmax_gradient(Z,isF = False):  \n",
    "    if isF:\n",
    "        F = Z\n",
    "    else:\n",
    "        F = softmax(Z)   \n",
    "    D = []\n",
    "    for i in range(F.shape[0]):\n",
    "        f = F[i]\n",
    "        D.append(np.diag(f.flatten()))\n",
    "    grads = D-np.einsum('ij,ik->ijk',F,F)\n",
    "    return grads\n",
    "\n",
    "# x is not defined in this cell; it is reused from an earlier one.\n",
    "print(softmax_gradient(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-0.39322387  0.39322387]\n",
      " [-0.09035332  0.09035332]]\n"
     ]
    }
   ],
   "source": [
    "def softmax_backward(Z,dF,isF = True):\n",
    "    \"\"\"Batched product of ``dF`` with the Jacobians from ``softmax_gradient(Z, isF)``.\"\"\"\n",
    "    jacobians = softmax_gradient(Z, isF)\n",
    "    # [B,D] x [B,D,D] -> [B,D]: contract index j separately for each batch entry b.\n",
    "    return np.einsum(\"bj, bjk -> bk\", dF, jacobians)\n",
    "\n",
    "df = np.array([[1, 3],[2, 4]])\n",
    "# Call the function defined in this cell (softmax_backward_2 only exists in a\n",
    "# later cell).  x, reused from an earlier cell, holds raw scores rather than\n",
    "# softmax outputs, so isF=False lets softmax be applied first.\n",
    "print(softmax_backward(x, df, isF=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'torch'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-38-bcae64ed95a7>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnn\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnn\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunctional\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mF\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[0mB\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mD\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'torch'"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# Compare the NumPy softmax backward pass with PyTorch autograd.\n",
    "B, D = 1, 3\n",
    "a = torch.randn([B, D], requires_grad=True)\n",
    "print(\"a\",a)\n",
    "# Give dim explicitly: softmax is taken over the D entries of each row.\n",
    "b = F.softmax(a, dim=1)\n",
    "print(\"b\",b)\n",
    "\n",
    "db = torch.randn([B, D])\n",
    "b.backward(db)\n",
    "print(\"a.grad\",a.grad)\n",
    "\n",
    "# Hand the NumPy routine NumPy arrays (db_, not the torch tensor db).\n",
    "a_ = a.detach().numpy()\n",
    "db_ = db.detach().numpy()\n",
    "da = softmax_backward(a_, db_, False)\n",
    "print(\"a.grad\",da)\n",
    "# Both gradients are computed in float32, hence the loose tolerance.\n",
    "np.testing.assert_allclose(da, a.grad.detach().numpy(), rtol=1e-4, atol=1e-5)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.26894142 0.73105858]\n",
      " [0.04742587 0.95257413]]\n",
      "[[[ 0.19661193 -0.19661193]\n",
      "  [-0.19661193  0.19661193]]\n",
      "\n",
      " [[ 0.04517666 -0.04517666]\n",
      "  [-0.04517666  0.04517666]]]\n",
      "[[-0.39322387  0.39322387]\n",
      " [-0.09035332  0.09035332]]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "\n",
    "def softmax_gradient(z):\n",
    "    \"\"\"Stack one softmax Jacobian ``diag(p) - outer(p, p)`` per row of ``softmax(z)``.\"\"\"\n",
    "    probs = softmax(z)\n",
    "    jacobians = [np.diag(p.flatten()) - np.outer(p, p) for p in probs]\n",
    "    return np.array(jacobians)\n",
    "\n",
    "    \n",
    "def softmax_backward(z,df):\n",
    "    \"\"\"Back-propagate ``df`` through the row-wise softmax of ``z``.\n",
    "\n",
    "    With ``len(df) == 1`` the flattened softmax output gives a single Jacobian\n",
    "    and the result is ``df @ jacobian``; otherwise row ``i`` of the result is\n",
    "    ``df[i] @ jacobian_i``.\n",
    "    \"\"\"\n",
    "    probs = softmax(z)\n",
    "    if len(df) != 1:\n",
    "        # One vector-Jacobian product per row of the softmax output.\n",
    "        return np.array([\n",
    "            df[i] @ (np.diag(p.flatten()) - np.outer(p, p))\n",
    "            for i, p in enumerate(probs)\n",
    "        ])\n",
    "    flat = probs.flatten()\n",
    "    return df @ (np.diag(flat) - np.outer(flat, flat))\n",
    "\n",
    "# Two-row demo: print the softmax output, its per-row Jacobians, and the\n",
    "# result of pushing df back through them.\n",
    "x = np.array([[1, 2],[2, 5]])\n",
    "print(softmax(x))\n",
    "print(softmax_gradient(x))\n",
    "df = np.array([[1, 3],[2, 4]])\n",
    "print(softmax_backward(x,df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[[ -1  -2]\n",
      "  [ -2  -4]]\n",
      "\n",
      " [[ -4 -10]\n",
      "  [-10 -25]]]\n",
      "[[[ -1  -2]\n",
      "  [ -2  -4]]\n",
      "\n",
      " [[ -4 -10]\n",
      "  [-10 -25]]]\n"
     ]
    }
   ],
   "source": [
    "def out_product(F):\n",
    "    grads=[]\n",
    "    for i in range(F.shape[0]):\n",
    "        f = F[i]\n",
    "        grad = -np.outer(f, f) # -np.outer(f, f) + np.diag(f.flatten()) \n",
    "        grads.append(grad)\n",
    "    grads = np.array(grads)\n",
    "    return grads\n",
    " \n",
    "# The explicit loop and the einsum expression should print identical arrays.\n",
    "x = np.array([[1, 2],[2, 5]])\n",
    "print(out_product(x))\n",
    "print( -np.einsum('ij,ik->ijk',x,x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[-0.39322387  0.39322387]\n",
      " [-0.09035332  0.09035332]]\n"
     ]
    }
   ],
   "source": [
    "def softmax_backward_2(Z,dF,isF = True):  \n",
    "    if isF:\n",
    "        F = Z\n",
    "    else:\n",
    "        F = softmax(Z)   \n",
    "    D = []\n",
    "    for i in range(F.shape[0]):\n",
    "        f = F[i]\n",
    "        D.append(np.diag(f.flatten()))\n",
    "    grads = D-np.einsum('ij,ik->ijk',F,F)     \n",
    "    grad = np.einsum(\"bj, bjk -> bk\", dF, grads)  # [B,D]*[B,D,D] -> [B,D]\n",
    "    return grad\n",
    "\n",
    "# x holds raw scores, not softmax outputs, so ask for softmax to be applied\n",
    "# (the default isF=True would treat x itself as the softmax output).\n",
    "print(softmax_backward_2(x, df, isF=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x: [[0 1 2]\n",
      " [3 4 5]]\n",
      "y: [[0 1 2]\n",
      " [3 4 5]]\n",
      "x: [[ 0  2  4]\n",
      " [ 6  8 10]]\n"
     ]
    }
   ],
   "source": [
    "# In-place `+=` demo: x is updated element-wise with y.\n",
    "# Note that this rebinds the global name x to a new 2x3 array.\n",
    "x = np.arange(6).reshape((2, 3))\n",
    "y = np.arange(6).reshape((2, 3))\n",
    "print(\"x:\",x)\n",
    "print(\"y:\",y)\n",
    "x+=y\n",
    "print(\"x:\",x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
